import pandas as pd
import numpy as np
import json
# Load the '\x01'-separated export and keep only the rows that carry both an
# 'airesponse' and a 'reason' value (rows missing either one are dropped).
# NOTE(review): the input path is hard-coded to one user's Downloads folder.
df = pd.read_csv(
    r'C:\Users\fengfeng.qiu\Downloads\570d391e-d64d-11ec-b2b1-00163e012765_20220518100329.csv',
    sep='\x01',
).dropna(subset=['airesponse', "reason"])

def fuc(x):
    """Return the ``sure_contraband`` value from a JSON payload.

    ``x`` is JSON text whose top-level object has a ``contraband_infos``
    sequence; the ``sure_contraband`` entry of its first element is returned
    unchanged.

    Raises whatever ``json.loads`` raises on invalid JSON, and
    ``KeyError``/``IndexError`` when the expected structure is missing.
    """
    payload = json.loads(x)
    first_info = payload["contraband_infos"][0]
    return first_info['sure_contraband']


def change_contraband_tag(item):
    """Map a row's free-text ``reason`` onto a unified note.

    ``item["reason"]`` is searched for a series of known phrases; the first
    phrase found decides the unified note that is returned. When no phrase
    matches, the original reason is returned unchanged.
    """
    # (phrase to look for, unified note). Order matters: the first hit wins.
    rules = (
        ("近期承运违规货源信息", "确认是违禁品"),
        ("近期发布违规货源信息", "确认是违禁品"),
        ('词错误，转文本', "关键词错误,语音转文本出错"),
        ('词错误，是转文本', "关键词错误,语音转文本出错"),
        ("无法确定货物为危化品", "词正确,无法确定货物为危化品"),
        ("无法确定是否为危化品", "词正确,无法确定货物为危化品"),
        ("词正确，用户仅聊到", "词正确,用户仅聊到"),
        ("听不懂", "语音或者文本,听不懂"),
        ("未找到违禁词", "未找到违禁词"),
        ("词正确，用户仅仅聊到", "词正确,用户仅聊到"),
        ("水性的", "词正确,无法确定货物为危化品"),
        ("货主打错字，实际是化肥", "关键词错误,语音转文本出错"),
        ("非危险品货源", "词正确,无法确定货物为危化品"),
        ("非违禁品货源", "词正确,无法确定货物为危化品"),
    )
    reason = item["reason"]
    for phrase, unified_note in rules:
        if phrase in reason:
            return unified_note
    return reason

def count_contraband(series):
    """Record this row's reason label under each of its contraband entries.

    For every element of ``series['airesponse']``, the position of
    ``series['reason']`` in the module-level ``labels`` list is appended to
    that element's list in the module-level ``contraband_dict``. The list is
    created the first time an element is seen. Returns ``None``; the function
    is used purely for its effect on ``contraband_dict``.
    """
    for word in series['airesponse']:
        # setdefault creates the empty bucket on first sight of `word`.
        contraband_dict.setdefault(word, []).append(labels.index(series['reason']))
            
# Replace each raw JSON payload with the `sure_contraband` value extracted by
# `fuc`, and collapse the free-text reasons into unified notes.
df['airesponse'] = df['airesponse'].apply(fuc)
df['reason'] = df.apply(change_contraband_tag, axis=1)

# `count_contraband` reads these two module-level names, so both must exist
# before the row-wise pass below.
labels = list(df['reason'].unique())
contraband_dict = {}

# Called for its side effect only: fills contraband_dict with, per contraband
# word, the list of label indices that word was seen with.
df.apply(count_contraband, axis=1)

# Turn each word's list of label indices into one count per label, so that
# position i holds the number of rows tagged labels[i].
contraband_dict = {
    word: [hits.count(i) for i in range(len(labels))]
    for word, hits in contraband_dict.items()
}

# Rows are contraband words, columns are the unified reason labels.
out = pd.DataFrame(contraband_dict, index=labels).T

# The report always shows these categories. Add any that never occurred in
# the input as all-zero columns; otherwise the lookups below raise KeyError.
# Zero columns do not affect the row totals.
REPORT_LABELS = ["确认是违禁品", '词正确,无法确定货物为危化品', '关键词错误,语音转文本出错']
for label in REPORT_LABELS:
    if label not in out.columns:
        out[label] = 0

# Total rows per word, and the share of them confirmed as contraband.
out["总数"] = out.sum(axis=1)
out["准确率"] = out["确认是违禁品"] / out["总数"]

# Move the contraband word from the index into a named column.
out = out.rename_axis("contraband_word").reset_index()
print(out.columns)
out = out[["contraband_word", "总数", "确认是违禁品", "准确率", '词正确,无法确定货物为危化品', '关键词错误,语音转文本出错']]
out = out.sort_values(by="总数", ascending=False)
out.to_excel("危禁品词准确率统计.xlsx")
print(out)









